{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a9c43238-2d32-42bc-8d55-2470951d30fd",
   "metadata": {},
   "source": [
    "# How to Sort Unicode Strings Alphabetically in Python?"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1f606efe-37d9-4da2-9b91-0786ee06fae0",
   "metadata": {},
   "source": [
    "## Lexicographic Sorting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "03e763c4-4c4d-40ab-9e26-72eaa7fbab09",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Ludmiła', 'Zbigniew', 'Łukasz', 'Żaneta']"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "polish_names = [\"Zbigniew\", \"Ludmiła\", \"Żaneta\", \"Łukasz\"]\n",
    "sorted(polish_names)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c020b0a1-4a4a-44df-8c6b-332f5c4e57b2",
   "metadata": {},
   "source": [
    "## Unicode Collation Algorithm"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b5cfebf0-3272-4a3a-a45d-fd24e0094994",
   "metadata": {},
   "source": [
    "### pyuca"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "54c6e6b6-18ad-4f68-a791-5b2de4169e41",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Ludmiła', 'Łukasz', 'Żaneta', 'Zbigniew']"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pyuca\n",
    "\n",
    "collator = pyuca.Collator()\n",
    "sorted(polish_names, key=collator.sort_key)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bedd5ce9-3135-46b8-b797-9da4d499c875",
   "metadata": {},
   "source": [
    "### PyICU"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "1c46a105-984e-43f2-8182-7d487e9e9622",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Ludmiła', 'Łukasz', 'Zbigniew', 'Żaneta']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from icu import Collator, Locale\n",
    "\n",
    "collator = Collator.createInstance(Locale(\"pl_PL\"))\n",
    "sorted(polish_names, key=collator.getSortKey)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e60cef37-5a80-4cce-91c7-16801690ea09",
   "metadata": {},
   "source": [
    "#### Locales"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "a18046a8-cc6b-4ef3-84fc-8d58da68c9d8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['af', 'af_NA', 'af_ZA', 'agq', 'agq_CM']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from icu import Locale\n",
    "\n",
    "list(Locale.getAvailableLocales())[:5]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6645de46-7f9d-4f5b-be72-2683d197a527",
   "metadata": {},
   "source": [
    "#### Rule-Based Collator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "969077e1-c524-4173-9cd6-f514b0865244",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Ludmiła', 'Łukasz', 'Zbigniew', 'Żaneta']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from icu import RuleBasedCollator\n",
    "\n",
    "collator = RuleBasedCollator(\n",
    "    \"\"\"\n",
    "    &A<ą<<<Ą\n",
    "    &C<ć<<<Ć\n",
    "    &E<ę<<<Ę\n",
    "    &L<ł<<<Ł\n",
    "    &N<ń<<<Ń\n",
    "    &O<ó<<<Ó\n",
    "    &S<ś<<<Ś\n",
    "    &Z<ź<<<Ź<ż<<<Ż\n",
    "    \"\"\"\n",
    ")\n",
    "sorted(polish_names, key=collator.getSortKey)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "468cdf0d-39c3-47f0-a3ae-54add7f45270",
   "metadata": {},
   "source": [
    "## Python's `locale` Module"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "b5f46445-b166-4858-9559-f9a90c7a9062",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Ludmiła', 'Łukasz', 'Zbigniew', 'Żaneta']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import locale\n",
    "\n",
    "locale.setlocale(locale.LC_COLLATE, \"pl_PL.UTF-8\")\n",
    "sorted(polish_names, key=locale.strxfrm)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0f6cc858-c872-409d-ae07-7587a7eb1412",
   "metadata": {},
   "source": [
    "## Transliteration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "8eb2cbb1-eff4-4f0b-a301-fc39835ffa69",
   "metadata": {},
   "outputs": [],
   "source": [
    "pangrams = {\n",
    "    \"Czech\": \"Příliš žluťoučký kůň úpěl ďábelské ódy\",\n",
    "    \"Polish\": \"Pójdźmyż haftnąć z wklęsłości guberń\",\n",
    "    \"Icelandic\": \"Kæmi ný öxi hér, ykist þjófum nú bæði víl og ádrepa\",\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "82afd3c6-424a-441b-a720-3110c05c75a3",
   "metadata": {},
   "source": [
    "### Python's `unicodedata` Module"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "b328bee5-1da2-4968-9f8e-c27943214820",
   "metadata": {},
   "outputs": [],
   "source": [
    "import unicodedata\n",
    "\n",
    "\n",
    "def transliterate_v1(text: str) -> str:\n",
    "    return (\n",
    "        unicodedata.normalize(\"NFD\", text)\n",
    "        .encode(\"ascii\", errors=\"ignore\")\n",
    "        .decode(\"ascii\")\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "952a8a5c-10bc-47db-b9c0-d136e2246fbd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Czech\n",
      "Příliš žluťoučký kůň úpěl ďábelské ódy\n",
      "Prilis zlutoucky kun upel dabelske ody\n",
      "\n",
      "Polish\n",
      "Pójdźmyż haftnąć z wklęsłości guberń\n",
      "Pojdzmyz haftnac z wklesosci gubern\n",
      "\n",
      "Icelandic\n",
      "Kæmi ný öxi hér, ykist þjófum nú bæði víl og ádrepa\n",
      "Kmi ny oxi her, ykist jofum nu bi vil og adrepa\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for label, pangram in pangrams.items():\n",
    "    print(label, pangram, transliterate_v1(pangram), \"\", sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "066447d0-a6ef-4b72-8f3b-327c5716cb82",
   "metadata": {},
   "source": [
    "### Custom Translation Table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "4b0a0b51-3d18-4c59-af87-6d22527ee07e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "\n",
    "\n",
    "def transliterate_v2(text: str, mapping: dict[str, str] = None) -> str:\n",
    "    combining_characters = \"\".join(\n",
    "        character\n",
    "        for code_point in range(sys.maxunicode)\n",
    "        if unicodedata.combining(character := chr(code_point))\n",
    "    )\n",
    "    if mapping:\n",
    "        src, dst = [\"\".join(x) for x in zip(*mapping.items(), strict=False)]\n",
    "        table = str.maketrans(src, dst, combining_characters)\n",
    "    else:\n",
    "        table = str.maketrans(dict.fromkeys(combining_characters))\n",
    "    return unicodedata.normalize(\"NFD\", text).translate(table)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "e7e5852c-3344-4cfc-bea2-28d0eb6c9e26",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Czech\n",
      "Příliš žluťoučký kůň úpěl ďábelské ódy\n",
      "Prilis zlutoucky kun upel dabelske ody\n",
      "\n",
      "Polish\n",
      "Pójdźmyż haftnąć z wklęsłości guberń\n",
      "Pojdzmyz haftnac z wkleslosci gubern\n",
      "\n",
      "Icelandic\n",
      "Kæmi ný öxi hér, ykist þjófum nú bæði víl og ádrepa\n",
      "Kæmi ny oxi her, ykist þjofum nu bæði vil og adrepa\n",
      "\n"
     ]
    }
   ],
   "source": [
    "mapping = {\"Ł\": \"L\", \"ł\": \"l\"}\n",
    "for label, pangram in pangrams.items():\n",
    "    print(label, pangram, transliterate_v2(pangram, mapping), \"\", sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e9dc5201-ae4c-4240-bfe7-0b6fd9c47076",
   "metadata": {},
   "source": [
    "### Unidecode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "2b42a243-7adf-442a-820b-f420be5c2b04",
   "metadata": {},
   "outputs": [],
   "source": [
    "import unidecode\n",
    "\n",
    "\n",
    "def transliterate_v3(text: str) -> str:\n",
    "    return unidecode.unidecode(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "349015e1-2167-4521-ac9f-f9b4c0ad7d08",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Czech\n",
      "Příliš žluťoučký kůň úpěl ďábelské ódy\n",
      "Prilis zlutoucky kun upel dabelske ody\n",
      "\n",
      "Polish\n",
      "Pójdźmyż haftnąć z wklęsłości guberń\n",
      "Pojdzmyz haftnac z wkleslosci gubern\n",
      "\n",
      "Icelandic\n",
      "Kæmi ný öxi hér, ykist þjófum nú bæði víl og ádrepa\n",
      "Kaemi ny oxi her, ykist thjofum nu baedi vil og adrepa\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for label, pangram in pangrams.items():\n",
    "    print(label, pangram, transliterate_v3(pangram), \"\", sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8bf35bce-44b5-4c1f-851c-f175dfbb9c19",
   "metadata": {},
   "source": [
    "### PyICU Transliterator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "cbda5081-f987-403f-a0a8-7616766e5d86",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Czech\n",
      "Příliš žluťoučký kůň úpěl ďábelské ódy\n",
      "Prilis zlutoucky kun upel dabelske ody\n",
      "\n",
      "Polish\n",
      "Pójdźmyż haftnąć z wklęsłości guberń\n",
      "Pojdzmyz haftnac z wkleslosci gubern\n",
      "\n",
      "Icelandic\n",
      "Kæmi ný öxi hér, ykist þjófum nú bæði víl og ádrepa\n",
      "Kaemi ny oxi her, ykist thjofum nu baedi vil og adrepa\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import icu\n",
    "\n",
    "tr = icu.Transliterator.createInstance(\"Any-ASCII\")\n",
    "\n",
    "for label, pangram in pangrams.items():\n",
    "    print(label, pangram, tr.transliterate(pangram), \"\", sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "680566f6-79c9-46a2-8755-fcf2c674435e",
   "metadata": {},
   "source": [
    "## Case-Insensitive Sorting"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d445b3a0-afb7-41c5-95cd-b7c73ce01c2d",
   "metadata": {},
   "source": [
    "### Case Folding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "531999cf-a828-490b-a42b-a0b968280c04",
   "metadata": {},
   "outputs": [],
   "source": [
    "import functools\n",
    "\n",
    "\n",
    "def case_insensitive(text: str) -> str:\n",
    "    nfd = functools.partial(unicodedata.normalize, \"NFD\")\n",
    "    return nfd(nfd(text).casefold())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "29a9f7cf-cb6f-441c-b1f0-8816374bf90d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Éléphant', 'éléphant', 'hérissonne', 'poisson', 'Tortue']"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "animaux = [\"Tortue\", \"hérissonne\", \"Éléphant\", \"poisson\", \"éléphant\"]\n",
    "sorted(animaux, key=case_insensitive)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "034ae6ce-0e6e-46df-b643-8fa578cba15a",
   "metadata": {},
   "source": [
    "### PyICU"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "115bd56c-9a72-4964-985d-d70aa5642444",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Éléphant', 'éléphant', 'hérissonne', 'poisson', 'Tortue']"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from icu import Collator, Locale\n",
    "\n",
    "collator = Collator.createInstance(Locale(\"fr_FR\"))\n",
    "collator.setStrength(Collator.SECONDARY)\n",
    "sorted(animaux, key=collator.getSortKey)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0fa814fb-7e8f-4830-bc23-0b581bc156d3",
   "metadata": {},
   "source": [
    "### pyuca\n",
    "\n",
    "Lowercase letters always come before uppercase letters:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "28cf5632-b03f-41f6-a680-f23a197c7dcd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['éléphant', 'Éléphant', 'hérissonne', 'poisson', 'Tortue']"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from pyuca import Collator\n",
    "\n",
    "sorted(animaux, key=Collator().sort_key)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1be29770-ec9c-46fb-8282-49a92a72a73d",
   "metadata": {},
   "source": [
    "## Natural Sort Order"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "aa4c482b-2650-46d2-a43b-163aa7406e3b",
   "metadata": {},
   "source": [
    "### Regular Expressions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "6522bf3c-a7f3-41f2-ad2a-dda9acb46af7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "\n",
    "def natural_order(text: str) -> tuple[str | int, ...]:\n",
    "    return tuple(\n",
    "        int(chunk) if chunk.isdigit() else chunk\n",
    "        for chunk in re.split(r\"(\\d+)\", text)\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "b68c42d5-9d08-4896-9b0d-3dcb6ea5872d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['log.1',\n",
       " 'log.2',\n",
       " 'log.3',\n",
       " 'log.4',\n",
       " 'log.5',\n",
       " 'log.6',\n",
       " 'log.7',\n",
       " 'log.8',\n",
       " 'log.9',\n",
       " 'log.10',\n",
       " 'log.11',\n",
       " 'log.12',\n",
       " 'log.13',\n",
       " 'log.14',\n",
       " 'log.15']"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sorted([f\"log.{i}\" for i in range(1, 111)], key=natural_order)[:15]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f09a10ff-004b-436a-bb19-6184f1f5db23",
   "metadata": {},
   "source": [
    "### PyICU + natsort"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "c60c52b9-40bf-4fc7-a2a6-51069370c83b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['błędy.log.1',\n",
       " 'błędy.log.3',\n",
       " 'Błędy.log.101',\n",
       " 'Raport_kwiecień_2023.xlsx',\n",
       " 'raport_maj-2023.xlsx']"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import locale\n",
    "\n",
    "from natsort import natsorted, ns\n",
    "\n",
    "filenames = [\n",
    "    \"raport_maj-2023.xlsx\",\n",
    "    \"błędy.log.3\",\n",
    "    \"błędy.log.1\",\n",
    "    \"Raport_kwiecień_2023.xlsx\",\n",
    "    \"Błędy.log.101\",\n",
    "]\n",
    "\n",
    "locale.setlocale(locale.LC_ALL, \"pl_PL.UTF-8\")\n",
    "\n",
    "natsorted(filenames, alg=ns.LOCALE | ns.IGNORECASE)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "110b932b-6187-4625-8bda-b7f9a63d152c",
   "metadata": {},
   "source": [
    "## Sorting Complex Objects"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "719b79fe-f1d7-42c9-8043-e66d43bef002",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Anna Nowak,\n",
       " Anna Wójcik,\n",
       " Ludmiła Wiśniewska,\n",
       " Zbigniew Nowak,\n",
       " Łukasz Kowalski,\n",
       " Żaneta Jabłońska]"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from typing import NamedTuple\n",
    "\n",
    "\n",
    "class Person(NamedTuple):\n",
    "    first_name: str\n",
    "    last_name: str\n",
    "\n",
    "    def __repr__(self):\n",
    "        return f\"{self.first_name} {self.last_name}\"\n",
    "\n",
    "\n",
    "people = [\n",
    "    Person(\"Zbigniew\", \"Nowak\"),\n",
    "    Person(\"Anna\", \"Wójcik\"),\n",
    "    Person(\"Łukasz\", \"Kowalski\"),\n",
    "    Person(\"Żaneta\", \"Jabłońska\"),\n",
    "    Person(\"Anna\", \"Nowak\"),\n",
    "    Person(\"Ludmiła\", \"Wiśniewska\"),\n",
    "]\n",
    "\n",
    "sorted(people)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "56c08bf3-a482-4d7d-929f-61cb51c8f3c4",
   "metadata": {},
   "source": [
    "### pyuca"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "9659e59f-6071-42a2-b822-61c7e5abdd69",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Anna Nowak,\n",
       " Anna Wójcik,\n",
       " Ludmiła Wiśniewska,\n",
       " Łukasz Kowalski,\n",
       " Żaneta Jabłońska,\n",
       " Zbigniew Nowak]"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pyuca\n",
    "\n",
    "collator = pyuca.Collator()\n",
    "\n",
    "\n",
    "def compound_key(person: Person) -> tuple:\n",
    "    return (\n",
    "        collator.sort_key(person.first_name),\n",
    "        collator.sort_key(person.last_name),\n",
    "    )\n",
    "\n",
    "\n",
    "sorted(people, key=compound_key)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "33cf460f-6937-4566-b4f0-9d06d5a8b29c",
   "metadata": {},
   "source": [
    "### PyICU"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "5be4e5d9-42b3-4369-9cb8-eb18adff4144",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Anna Nowak,\n",
       " Anna Wójcik,\n",
       " Ludmiła Wiśniewska,\n",
       " Łukasz Kowalski,\n",
       " Zbigniew Nowak,\n",
       " Żaneta Jabłońska]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from icu import Collator, Locale\n",
    "\n",
    "collator = Collator.createInstance(Locale(\"pl_PL\"))\n",
    "\n",
    "\n",
    "def compound_key(person: Person) -> tuple:\n",
    "    return (\n",
    "        collator.getSortKey(person.first_name),\n",
    "        collator.getSortKey(person.last_name),\n",
    "    )\n",
    "\n",
    "\n",
    "sorted(people, key=compound_key)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
